--- title: Title keywords: fastai sidebar: home_sidebar nb_path: "EDA.ipynb" ---
{% raw %}
{% endraw %} {% raw %}
from glob import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow_io as tfio
{% endraw %} {% raw %}
import tensorflow as tf
{% endraw %} {% raw %}
from IPython import display
{% endraw %} {% raw %}
import random
# Fix the RNG seed so the randomly chosen sample inspected below is reproducible.
random.seed(42)
{% endraw %} {% raw %}
# Species codes for the dataset subdirectories — presumably eBird six-letter codes
# (norcar = Northern Cardinal, blujay = Blue Jay, bkcchi = Black-capped Chickadee);
# verify against the dataset's documentation.
ebirds = ['norcar', 'blujay', 'bkcchi']
{% endraw %} {% raw %}
# All recordings for the first species. An f-string is the idiomatic way to
# build the glob pattern (produces the same path as the old '+' concatenation).
files = glob(f'dataset/{ebirds[0]}/*')
{% endraw %} {% raw %}
# Decode every MP3 for this species and record each file's decoded shape
# (samples, channels) alongside the decoded tensor itself.
file_shape, decoded_audio = [], []
for path in files:
    raw_bytes = tf.io.read_file(path)
    decoded = tfio.audio.decode_mp3(raw_bytes)
    decoded_audio.append(decoded)
    file_shape.append(decoded.shape)
{% endraw %}

Some recordings are mono (single-channel), while others are stereo.

{% raw %}
# Channel count per file (second dimension of each decoded shape).
# Iterate the list directly rather than indexing via range(len(...)).
num_channels = np.array([shape[1] for shape in file_shape])
{% endraw %} {% raw %}
# How many files are mono vs. stereo across the dataset.
np.unique(num_channels, return_counts=True)
(array([1, 2]), array([57, 43]))
{% endraw %}

The differences in file length are pretty large.

{% raw %}
# Length in samples per file (first dimension of each decoded shape).
# Iterate the list directly rather than indexing via range(len(...)).
file_lengths = np.array([shape[0] for shape in file_shape])
{% endraw %} {% raw %}
# Ratio of the longest to the shortest recording — a quick measure of spread.
file_lengths.max() / file_lengths.min()
148.24831712789828
{% endraw %} {% raw %}
# Index of the longest recording (listened to below).
file_lengths.argmax()
97
{% endraw %} {% raw %}
# Distribution of recording lengths (in samples); heavily right-skewed.
# file_lengths is already an ndarray, so the extra np.array() wrapper was redundant.
plt.hist(file_lengths)
(array([64., 26.,  4.,  1.,  2.,  0.,  1.,  1.,  0.,  1.]),
 array([  168462. ,  2649036.6,  5129611.2,  7610185.8, 10090760.4,
        12571335. , 15051909.6, 17532484.2, 20013058.8, 22493633.4,
        24974208. ]),
 <BarContainer object of 10 artists>)
{% endraw %} {% raw %}
# Indices of the shortest and longest recordings, kept for listening/inspection below.
shortest_file_idx = file_lengths.argmin()
longest_file_idx = file_lengths.argmax()
{% endraw %} {% raw %}
# Play back the longest recording in the notebook (display side effect only).
display.Audio(files[longest_file_idx])
{% endraw %} {% raw %}
# Per-file summary statistics of the decoded audio; one list entry per recording.
decoded_audio_stats = {'max':[], 'min':[], 'mean':[], 'std':[], 'len':[], 'channels':[]}

for sample in decoded_audio:
    # Convert the tensor to a NumPy array once, instead of once per statistic.
    values = sample.numpy()
    decoded_audio_stats['max'].append(values.max())
    decoded_audio_stats['min'].append(values.min())
    decoded_audio_stats['mean'].append(values.mean())
    decoded_audio_stats['std'].append(values.std())
    decoded_audio_stats['len'].append(sample.shape[0])
    decoded_audio_stats['channels'].append(sample.shape[1])
{% endraw %} {% raw %}
# Peak-to-peak amplitude per recording. Derived from the max/min statistics
# already collected above, avoiding another full pass over the decoded tensors
# (range = max - min produces exactly the same values).
decoded_audio_stats['range'] = [
    mx - mn for mx, mn in zip(decoded_audio_stats['max'], decoded_audio_stats['min'])
]
{% endraw %}

We will need to min-max scale the input. There are implications, though: what if some birds are just quieter than others?

{% raw %}
# Distribution of per-file peak-to-peak amplitudes — motivates amplitude scaling.
plt.hist(decoded_audio_stats['range'])
(array([18., 13., 13., 16., 12., 12.,  7.,  3.,  4.,  2.]),
 array([0.0691722 , 0.33101803, 0.5928639 , 0.85470974, 1.1165556 ,
        1.3784014 , 1.6402472 , 1.9020932 , 2.163939  , 2.4257848 ,
        2.6876307 ], dtype=float32),
 <BarContainer object of 10 artists>)
{% endraw %}

There are periods of silence. Maybe the recordings can be split at the silent gaps?

{% raw %}
# Inspect one recording at random (seeded above, so the pick is reproducible).
sample = random.choice(decoded_audio)
# Plots one line per channel — two lines here, so this file is stereo.
plt.plot(sample)
[<matplotlib.lines.Line2D at 0x149029eb0>,
 <matplotlib.lines.Line2D at 0x1493d0220>]
{% endraw %}

There are slight differences between the two channels. It may be best to use only one channel, since sometimes the other channel carries no signal.

{% raw %}
# Zoom into a short window (100 samples) to compare the two channels directly.
plt.plot(sample[1250:1350])
[<matplotlib.lines.Line2D at 0x1494cb4f0>,
 <matplotlib.lines.Line2D at 0x149309460>]
{% endraw %}

Should we make a prediction on a minimum window size, and take a majority vote over windows when the recording is longer?

spectrogram

{% raw %}
# Scale channel 0 so its peak-to-peak range is exactly 1.
# NOTE(review): this only divides by the range — it does NOT subtract the
# minimum, so values are not shifted into [0, 1]; not a true min-max scale.
channel = sample[:, 0].numpy()  # convert once instead of twice
sample_range = channel.max() - channel.min()
sample = sample[:, 0] / sample_range
# Sanity check: the rescaled range should now be 1.0.
scaled = sample.numpy()
sample_range = scaled.max() - scaled.min()
sample_range
1.0
{% endraw %} {% raw %}
# Magnitude spectrogram: 512-point FFT over 512-sample windows with a
# 256-sample hop (50% overlap).
sample_spectrogram = tfio.experimental.audio.spectrogram(sample, nfft=512, window=512, stride=256, name='spectrogram')
{% endraw %} {% raw %}
plt.figure()
# Log-scale the magnitudes for visibility; transpose so frequency runs along
# the vertical axis. NOTE(review): any zero bins become -inf under log —
# consider adding a small epsilon if warnings appear.
data = tf.math.log(tf.transpose(sample_spectrogram)).numpy()
plt.imshow(data, aspect='auto', origin='lower')
<matplotlib.image.AxesImage at 0x149581970>
{% endraw %} {% raw %}
# Waveform of the rescaled single channel (one line now, since it is mono).
plt.plot(sample)
[<matplotlib.lines.Line2D at 0x1495e3c40>]
{% endraw %}